rm(list=ls(all=T))
Sys.setlocale("LC_ALL","C")
## [1] "C"
pacman::p_load(dplyr, ggplot2, plotly)
load("rdata/Z.rdata")
library(maps)
## Warning: package 'maps' was built under R version 3.5.3
Brazil = map_data("world") %>% filter(region=="Brazil")

brazilPlot = ggplot() +
  geom_polygon(data = Brazil, aes(x=long, y = lat, group = group), fill="gray")
# Removing some outliers
#Brazils most Northern spot is at 5 deg 16′ 27.8″ N latitude.;
Geo = Geo[Geo$geolocation_lat <= 5.27438888,]
#it’s most Western spot is at 73 deg, 58′ 58.19″W Long.
Geo = Geo[Geo$geolocation_lng >= -73.98283055,]
#It’s most southern spot is at 33 deg, 45′ 04.21″ S Latitude.
Geo = Geo[Geo$geolocation_lat >= -33.75116944,]
#It’s most Eastern spot is 34 deg, 47′ 35.33″ W Long.
Geo = Geo[Geo$geolocation_lng <=  -34.79314722,]

n_distinct(Geo$geolocation_zip_code_prefix)
## [1] 19010
table(Geo$geolocation_zip_code_prefix) %>% table %>% tail
## .
##  832  879  907  965 1102 1146 
##    1    1    1    1    1    1

merge data: Custome + Order + Geolocation + OrderPayment

custlocation = Geo %>% group_by(geolocation_zip_code_prefix) %>% 
  summarise(custlat = max(geolocation_lat),
            custlng = max(geolocation_lng))

COG = OrdPay[!duplicated(OrdPay$order_id),] %>% 
  right_join(Ord, by = "order_id") %>% 
  left_join(Cust, by="customer_id") %>% 
  left_join(custlocation, by=c("customer_zip_code_prefix"="geolocation_zip_code_prefix"))

看每個order都來自哪裡

g = brazilPlot +
  geom_point(data = COG,aes(x=custlng,y=custlat,color=customer_state),size=0.2)
ggplotly(g)

單純直接把Geo這個資料集畫出來

brazilPlot +
  geom_point(data = Geo, aes(x=geolocation_lng, y=geolocation_lat, color=geolocation_state),size=0.2)

States = COG %>% group_by(customer_state) %>% 
  summarise(
    noCust = n_distinct(customer_id),                              # 總共有幾個顧客
    noOrder = n(),                                                 # 總共有幾筆交易
    noItem = sum(order_item_count),                                # 總共買幾個產品
    avgItem = mean(order_item_count),                              # 平均每個交易買幾個產品
    totalRevenue = sum(order_item_value),                          # 總共Revenue
    avgRevenue = mean(order_item_value),                           # 平均每筆交易的Revenue (客單價)
    avgFreight = mean(order_freight_value),                        # 平均每筆交易的freight運費
    avgDeliveryDays = mean(delivery_days),                         # 平均運送天數
    cancelStatusProportion = mean(order_status=="canceled"),       # 棄單比率
    deliveredStatusProportion = mean(order_status=="delivered"),   # 成單比率
    payType_boleto = mean(payment_type == "boleto"),               # boleto付款比率
    payType_CreditCard = mean(payment_type == "credit_card"),      # credit card付款比率
    payType_debitCard = mean(payment_type == "debit_card"),        # debit card付款比率
    payType_voucher = mean(payment_type == "voucher")              # voucher付款比率
    )
States = Geo %>% group_by(geolocation_state) %>% 
  summarise(lng = mean(geolocation_lng),
            lat = mean(geolocation_lat)) %>% 
  right_join(., States, by = c("geolocation_state" = "customer_state"))

summary(States)
##  geolocation_state      lng              lat              noCust     
##  AC     : 1        Min.   :-68.45   Min.   :-29.680   Min.   :   46  
##  AL     : 1        1st Qu.:-51.63   1st Qu.:-19.988   1st Qu.:  378  
##  AM     : 1        Median :-47.97   Median :-10.341   Median :  903  
##  AP     : 1        Mean   :-47.53   Mean   :-12.453   Mean   : 3654  
##  BA     : 1        3rd Qu.:-40.03   3rd Qu.: -5.806   3rd Qu.: 2742  
##  CE     : 1        Max.   :-35.76   Max.   :  2.717   Max.   :41375  
##  (Other):21                                                          
##     noOrder          noItem           avgItem       totalRevenue    
##  Min.   :   46   Min.   :   52.0   Min.   :1.080   Min.   :   7829  
##  1st Qu.:  378   1st Qu.:  414.5   1st Qu.:1.114   1st Qu.:  69618  
##  Median :  903   Median : 1055.0   Median :1.131   Median : 156454  
##  Mean   : 3654   Mean   : 4172.2   Mean   :1.132   Mean   : 503394  
##  3rd Qu.: 2742   3rd Qu.: 3102.5   3rd Qu.:1.147   3rd Qu.: 406977  
##  Max.   :41375   Max.   :47449.0   Max.   :1.206   Max.   :5202955  
##                                                                     
##    avgRevenue      avgFreight    avgDeliveryDays cancelStatusProportion
##  Min.   :125.8   Min.   :17.37   Min.   : NA     Min.   :0.000000      
##  1st Qu.:143.5   1st Qu.:24.89   1st Qu.: NA     1st Qu.:0.001069      
##  Median :164.8   Median :36.44   Median : NA     Median :0.003034      
##  Mean   :164.1   Mean   :34.39   Mean   :NaN     Mean   :0.003393      
##  3rd Qu.:177.1   3rd Qu.:41.53   3rd Qu.: NA     3rd Qu.:0.004120      
##  Max.   :216.7   Max.   :48.59   Max.   : NA     Max.   :0.021739      
##                                  NA's   :27                            
##  deliveredStatusProportion payType_boleto   payType_CreditCard
##  Min.   :0.8913            Min.   :0.1429   Min.   :0.6765    
##  1st Qu.:0.9693            1st Qu.:0.1764   1st Qu.:0.7182    
##  Median :0.9789            Median :0.2068   Median :0.7453    
##  Mean   :0.9744            Mean   :0.2126   Mean   :0.7496    
##  3rd Qu.:0.9839            3rd Qu.:0.2475   3rd Qu.:0.7802    
##  Max.   :0.9887            Max.   :0.2941   Max.   :0.8367    
##                            NA's   :1        NA's   :1         
##  payType_debitCard payType_voucher  
##  Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.01166   1st Qu.:0.02181  
##  Median :0.01380   Median :0.02713  
##  Mean   :0.01316   Mean   :0.02463  
##  3rd Qu.:0.01537   3rd Qu.:0.02930  
##  Max.   :0.02469   Max.   :0.03854  
##  NA's   :1         NA's   :1
g =  brazilPlot +
  geom_point(data = States,
             aes(x=lng, y=lat, color = cancelStatusProportion, size = noOrder)
  )
ggplotly(g)
g = ggplot(States, aes(x=avgItem, y=avgRevenue, col=avgDeliveryDays)) +
  geom_point(aes(size=noCust)) +
  geom_text(aes(label=geolocation_state), size=4, check_overlap=T, nudge_y = 3)
ggplotly(g)